MatMulFusion ================= 矩阵乘法融合(可选偏置和激活),计算: .. math:: C = \operatorname{act}(A \times B + \text{bias}) 其中 ``A`` 形状为 :math:`M\times K`,``B`` 为 :math:`K\times N`,``C`` 为 :math:`M\times N`;可选的 ``bias`` 在 ``bias_broadcast=0`` 时为 :math:`M\times N`,在 ``bias_broadcast=1`` 时为长度 :math:`N` 的向量。 激活 ``act`` 支持: - ``0``: 无激活(Identity) - ``1``: ReLU - ``2``: ReLU6 输入: - **A** - 输入矩阵 A(行优先,连续存储)。大小 M×K。 - **B** - 输入矩阵 B(行优先,连续存储)。大小 K×N。 - **bias** - 偏置矩阵(可为 NULL)。当 ``bias_broadcast=1`` 时大小为 N,否则为 M×N。 - **params** - 参数打包成数组(共 7 个元素): - **params[0] (M)** - 矩阵 A 与 C 的行数。 - **params[1] (N)** - 矩阵 B 与 C 的列数。 - **params[2] (K)** - 矩阵 A 的列数(即 B 的行数)。 - **params[3] (activation_type)** - 激活类型,取值 {0,1,2}。 - **params[4] (A_transpose)** - A 矩阵是否转置,取值 {0,1}。 - **params[5] (B_transpose)** - B 矩阵是否转置,取值 {0,1}。 - **params[6] (bias_broadcast)** - 偏置是否广播,取值 {0,1}。 - **core_mask(可选)** - 核掩码(仅适用于共享存储版本)。 输出: - **C** - 输出矩阵(行优先,大小 M×N)。 支持平台: ``FT78NE`` ``MT7004`` .. note:: - FT78NE 支持 int8, int16, int32, fp32, fp64, cplx64, cplx128 - MT7004 支持 fp16, fp32, int16, int32, cplx64 - 复数类型的激活逐分量应用于实部与虚部 - 请确保输入按行优先连续布局,且不发生类型范围溢出;int8/int16/int32 计算未做饱和裁剪 - 转置操作通过参数控制,无需预先转置矩阵 **共享存储版本:** .. c:function:: void i8_matmul_fusion_s(int8_t *A, int8_t *B, int8_t *C, int8_t *bias, long long *params, int core_mask) .. c:function:: void i16_matmul_fusion_s(int16_t *A, int16_t *B, int16_t *C, int16_t *bias, long long *params, int core_mask) .. c:function:: void i32_matmul_fusion_s(int *A, int *B, int *C, int *bias, long long *params, int core_mask) .. c:function:: void hp_matmul_fusion_s(half *A, half *B, half *C, half *bias, long long *params, int core_mask) .. c:function:: void fp_matmul_fusion_s(float *A, float *B, float *C, float *bias, long long *params, int core_mask) .. c:function:: void dp_matmul_fusion_s(double *A, double *B, double *C, double *bias, long long *params, int core_mask) .. c:function:: void c64_matmul_fusion_s(float *A, float *B, float *C, float *bias, long long *params, int core_mask) ..
c:function:: void c128_matmul_fusion_s(double *A, double *B, double *C, double *bias, long long *params, int core_mask) **C调用示例:** .. code-block:: c :linenos: :emphasize-lines: 46 #include #include int main(int argc, char* argv[]) { float* A_ref = (float*)0x90000000; float* B_ref = (float*)0x91000000; float* C_ref = (float*)0x92000000; float* bias_ref = (float*)0x93000000; float* C_multi = (float*)0x95000000; int core_mask = 0b1111; // 使用4核 int M = 64; int N = 64; int K = 64; bool bias_broadcast = true; bool A_transpose = false; bool B_transpose = true; // Initialize test data (core 0 only) if (coreid == 0) { // Initialize A, B, bias with small values for (int i = 0; i < M * K; ++i) { A_ref[i] = (float)(i % 10) * 0.1f; } for (int i = 0; i < K * N; ++i) { B_ref[i] = (float)(i % 10) * 0.1f; } for (int i = 0; i < M * N; ++i) { C_ref[i] = 0.0f; C_multi[i] = 0.0f; bias_ref[i] = (float)(i % 5) * 0.01f; } } long long params[7]; params[0] = (long long)M; params[1] = (long long)N; params[2] = (long long)K; params[3] = (long long)ACTIVATION_RELU; params[4] = (long long)A_transpose; params[5] = (long long)B_transpose; params[6] = (long long)bias_broadcast; fp_matmul_fusion_s(A_ref, B_ref, C_multi, bias_ref, params, core_mask); return 0; } **私有存储版本:** .. c:function:: void i8_matmul_fusion_p(int8_t *A, int8_t *B, int8_t *C, int8_t *bias, long long *params) .. c:function:: void i16_matmul_fusion_p(int16_t *A, int16_t *B, int16_t *C, int16_t *bias, long long *params) .. c:function:: void i32_matmul_fusion_p(int *A, int *B, int *C, int *bias, long long *params) .. c:function:: void hp_matmul_fusion_p(half *A, half *B, half *C, half *bias, long long *params) .. c:function:: void fp_matmul_fusion_p(float *A, float *B, float *C, float *bias, long long *params) .. c:function:: void dp_matmul_fusion_p(double *A, double *B, double *C, double *bias, long long *params) .. c:function:: void c64_matmul_fusion_p(float *A, float *B, float *C, float *bias, long long *params) .. 
c:function:: void c128_matmul_fusion_p(double *A, double *B, double *C, double *bias, long long *params) **C调用示例:** .. code-block:: c :linenos: :emphasize-lines: 43 #include #include int main(int argc, char* argv[]) { float* A_ref = (float*)0x10010000; float* B_ref = (float*)0x10020000; float* C_ref = (float*)0x10030000; float* bias_ref = (float*)0x10040000; float* C_single = (float*)0x10050000; int M = 8; int N = 8; int K = 8; bool bias_broadcast = true; bool A_transpose = true; bool B_transpose = false; // Initialize A, B, bias with small values for (int i = 0; i < M * K; ++i) { A_ref[i] = (float)(i % 10) * 0.1f; } for (int i = 0; i < K * N; ++i) { B_ref[i] = (float)(i % 10) * 0.1f; } for (int i = 0; i < M * N; ++i) { C_ref[i] = 0.0f; C_single[i] = 0.0f; bias_ref[i] = (float)(i % 5) * 0.01f; } long long params[7]; params[0] = (long long)M; params[1] = (long long)N; params[2] = (long long)K; params[3] = (long long)ACTIVATION_RELU; params[4] = (long long)A_transpose; params[5] = (long long)B_transpose; params[6] = (long long)bias_broadcast; fp_matmul_fusion_p(A_ref, B_ref, C_single, bias_ref, params); return 0; }